# read RNA expression counts and patient data
load(file="data/readMatrix_PBMC_BulkRNAseq_CVID.Rdata")
if (!file.exists("translated_genes.txt")) {
source("gene_translation.R")
}
mt.counts.df <- read.csv("translated_genes.txt", row.names = 1)
groups <- sample.info$disease
groups <- ifelse(groups %in% c("UCVID", "CVID"),
"Deficient",
as.character(groups))
# store index of NCVID subjects
remove_NCVID_index <- which(groups == "NCVID")
# update group label vector
groups <- groups[groups != "NCVID"]
groups <- factor(groups)
# check modified vector
print(groups)
## [1] healthy healthy healthy healthy healthy healthy Deficient
## [8] Deficient Deficient Deficient Deficient Deficient Deficient Deficient
## [15] Deficient Deficient Deficient Deficient Deficient Deficient Deficient
## [22] Deficient Deficient Deficient Deficient Deficient Deficient Deficient
## [29] Deficient Deficient Deficient Deficient Deficient Deficient healthy
## [36] healthy healthy healthy healthy healthy healthy healthy
## Levels: Deficient healthy
#updating dataframes
mt.counts.df <- mt.counts.df[,-remove_NCVID_index]
sample.info <- sample.info[-remove_NCVID_index,]
sample.info$disease <- groups
# re-store results
counts <- mt.counts.df
meta_data <- sample.info
# isolate group indicator variable
group <- meta_data$disease
# aggregate patient and expression data into a single object
se_cvid <- SummarizedExperiment(assays=list(counts=counts),
colData = meta_data)
# isolating treatment group experiments
treatments <- meta_data$treatment
ctrl_index <- which(treatments == "ctrl")
LPS_index <- which(treatments == "LPS")
counts_ctrl <- counts[,-LPS_index]
counts_LPS <- counts[,-ctrl_index]
meta_data_ctrl <- meta_data[-LPS_index,]
meta_data_LPS <- meta_data[-ctrl_index,]
se_cvid_ctrl <- SummarizedExperiment(assays=list(counts=counts_ctrl),
colData = meta_data_ctrl)
se_cvid_LPS <- SummarizedExperiment(assays=list(counts=counts_LPS),
colData = meta_data_LPS)
# create counts per million, log counts and log counts per million features
# and save them into new assays
se_cvid <- mkAssay(se_cvid, log = TRUE, counts_to_CPM = TRUE)
# same for control only
se_cvid_ctrl <- mkAssay(se_cvid_ctrl, log = TRUE, counts_to_CPM = TRUE)
# same for LPS treatment group only
se_cvid_LPS <- mkAssay(se_cvid_LPS, log = TRUE, counts_to_CPM = TRUE)
# display all assays, control and LPS treatment
assays(se_cvid)
## List of length 4
## names(4): counts log_counts counts_cpm log_counts_cpm
# display all assays, control only
assays(se_cvid_ctrl)
## List of length 4
## names(4): counts log_counts counts_cpm log_counts_cpm
# display all assays, LPS treatment only
assays(se_cvid_LPS)
## List of length 4
## names(4): counts log_counts counts_cpm log_counts_cpm
# fit PCA model to log cpm data
# note: transpose data to make it tidy
set.seed(1)
pca_out <- prcomp(t(assay(se_cvid,"log_counts_cpm")))
# define dataframe with PCA components and immune deficiency disease (IDC)
pca_plot <- as.data.frame(pca_out$x) # X, Y, ...
pca_plot$disease <- as.factor(se_cvid$disease) # color/group
# plot PC1 and PC2 by IDC
g <- pca_plot %>% ggplot(aes(x=PC1, y=PC2, color=disease)) +
geom_point(size=1.5) + xlab("PC 1") + ylab("PC 2") +
geom_text(label=colnames(se_cvid), nudge_y = 5,) +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("PCA Plot - Entire Sample")
plot(g)
# fit UMAP model to log cpm data
# note: transpose data to make it tidy
set.seed(1)
umap.defaults$n_neighbors=5
umap_out <- umap(t(assay(se_cvid,"log_counts_cpm")), config=umap.defaults)
# define dataframe with UMAP dimensions and IDC
umap_plot <- as.data.frame(umap_out$layout) # X, Y, ...
umap_plot$disease <- as.factor(se_cvid$disease) # color / group
# plot UMAP1 and UMAP2 by IDC
g <- umap_plot %>% ggplot(aes(x=V1, y=V2, color=disease)) +
geom_point(size=1.5) + xlab("UMAP1") + ylab("UMAP2") +
geom_text(label=colnames(se_cvid), nudge_y = 0.1,) +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("UMAP Plot - Entire Sample")
plot(g)
# fit PCA model to log cpm data
# note: transpose data to make it tidy
set.seed(1)
pca_out <- prcomp(t(assay(se_cvid,"log_counts_cpm")))
# define dataframe with PCA components and immune deficiency disease (IDC)
pca_plot <- as.data.frame(pca_out$x) # X, Y, ...
pca_plot$treatment <- as.factor(se_cvid$treatment) # color/group
# plot PC1 and PC2 by IDC
g <- pca_plot %>% ggplot(aes(x=PC1, y=PC2, color=treatment)) +
geom_point(size=1.5) + xlab("PC 1") + ylab("PC 2") +
geom_text(label=colnames(se_cvid), nudge_y = 5,) +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("PCA Plot - Entire Sample with Treatment Groups")
plot(g)
# fit UMAP model to log cpm data
# note: transpose data to make it tidy
set.seed(1)
umap.defaults$n_neighbors=5
umap_out <- umap(t(assay(se_cvid,"log_counts_cpm")), config=umap.defaults)
# define dataframe with UMAP dimensions and IDC
umap_plot <- as.data.frame(umap_out$layout) # X, Y, ...
umap_plot$treatment <- as.factor(se_cvid$treatment) # color / group
# plot UMAP1 and UMAP2 by IDC
g <- umap_plot %>% ggplot(aes(x=V1, y=V2, color=treatment)) +
geom_point(size=1.5) + xlab("UMAP1") + ylab("UMAP2") +
geom_text(label=colnames(se_cvid), nudge_y = 0.1,) +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("UMAP Plot - Entire Sample with Treatment Groups")
plot(g)
As we can observe, the treatment variable overwhelms the dimension reduction, indicating we should run separate analysis for each treatment group.
# fit PCA model to log cpm data for LPS treatment group
# note: transpose data to make it tidy
set.seed(1)
pca_out <- prcomp(t(assay(se_cvid_LPS,"log_counts_cpm")))
# define dataframe with PCA components and immune deficiency disease (IDC)
pca_plot <- as.data.frame(pca_out$x) # X, Y, ...
pca_plot$disease <- as.factor(se_cvid_LPS$disease) # color/group
# plot PC1 and PC2 by IDC
g <- pca_plot %>% ggplot(aes(x=PC1, y=PC2, color=disease)) +
geom_point(size=1.5) + xlab("PC 1") + ylab("PC 2") +
geom_text(label=colnames(se_cvid_LPS), nudge_y = 5,) +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("PCA Plot - LPS group")
plot(g)
# fit UMAP model to log cpm data
# note: transpose data to make it tidy
set.seed(1)
umap.defaults$n_neighbors=5
umap_out <- umap(t(assay(se_cvid_LPS,"log_counts_cpm")), config=umap.defaults)
# define dataframe with UMAP dimensions and IDC
umap_plot <- as.data.frame(umap_out$layout) # X, Y, ...
umap_plot$disease <- as.factor(se_cvid_LPS$disease) # color / group
# plot UMAP1 and UMAP2 by IDC
g <- umap_plot %>% ggplot(aes(x=V1, y=V2, color=disease)) +
geom_point(size=1.5) + xlab("UMAP1") + ylab("UMAP2") +
geom_text(label=colnames(se_cvid_LPS), nudge_y = 0.1,) +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("UMAP Plot - LPS group")
plot(g)
# format data into a DESeq2-friendly Data Structure
dds <- DESeqDataSetFromMatrix(countData=counts, colData=meta_data, design=~disease)
#colData is a data frame of demographic/phenotypic data
# filter for genes with significant expression
# expression should be > 0, but > 100 should be better
dds<-dds[rowSums(counts(dds))>1,] #Gene Filtering
# fit negative binomial regression
dds<-DESeq(dds) #Performs estimation of size factors, dispersion, and negative binomial GLM f#itting
# extracting results, ordering them by adjusted p-value
res <- results(dds)[order(results(dds)[,6]),]
#res[1:10,]
# dislpay results for the top 1000 most significant genes
datatable(data.frame(res[1:1000,]))
# store the name of the top 250 most significant genes
deseq250 <- rownames(res)[1:250]
LPS.deseq.res <- data.frame(res)
# Make a Heatmap of DEGs
# All together: extract the log cpm of the top 250 genes and store that in a matrix
# # get the name of the genes ordered by adjusted p-val
top_genes = rownames(results(dds)[order(results(dds)$padj),])[1:250]
# # convert log cpm to a matrix structure sorted by the top genes
mat = as.matrix(assay(se_cvid_LPS,"log_counts_cpm"))[top_genes,]
# normalize data row-wise
mat = t(scale(t(mat)))
# store IDC into a single-column dataframe
df=data.frame(disease=colData(se_cvid_LPS)$disease)
# create HeatMap with some annotations
ha_colors = list(disease=c("healthy"="Blue","Deficient"="Red"))
ha = HeatmapAnnotation(df = df, col = ha_colors)
Heatmap(mat,show_row_names=F,show_column_names = F, top_annotation=ha)
# collapse data into healthy or not
# Store data in Limma-friendly Data Structure
dge <- DGEList(counts=counts, group=group) #From edgeR, Computes library size
# filter for genes with significant expression
counts<-counts[which(rowSums(cpm(counts))>1),] #Gene Filtering #(mask)
dge <- DGEList(counts=counts, group=group) #Re-compute library size #(filter)
# Trimmed-Mean Normalization of data
dge <- calcNormFactors(dge) #TMM normalization
# Create design matrix (intercept + indicator variable)
design<-model.matrix(~group)
# distribution normalizing transformation
v<-voom(dge, design) #voom transform to calculate weights to eliminate mean-variance #relationship
#use usual limma pipelines
# fit empirical bayes regression model
fit<-lmFit(v,design)
fit<-eBayes(fit)
# organize data by top ranked genes when predicting IDC (coef = col#2 = UCVID)
# topTable(fit, coef=2)
# display top 1000 most significant genes
datatable(topTable(fit, coef=2, number=1000))
# store the name of the top 250 most significant genes
limma250 <- rownames(topTable(fit, coef=2, number=250))
LPS.limma.res <- data.frame(topTable(fit, coef=2))
# Make a Heatmap of DEGs
# extract the log cpm of the top 250 genes and store that in a matrix
# get the name of the top 250 most significant genes
top_genes = rownames(topTable(fit, coef=2, number=250))
# convert log cpm to a matrix structure
mat = as.matrix(assay(se_cvid_LPS,"log_counts_cpm"))[top_genes,]
# normalize data row-wise
mat = t(scale(t(mat)))
# store IDC into a single-column dataframe
df=data.frame(disease=colData(se_cvid_LPS)$disease)
# create HeatMap with some annotations
ha = HeatmapAnnotation(df = df, col=ha_colors)
Heatmap(mat,show_row_names=F,show_column_names=F, top_annotation=ha)
# based on the results of DEseq, cross-reference the gene with pathway databases to infer which pathways are the most significant
enriched <- enrichr(deseq250, dbs)
## Uploading data to Enrichr... Done.
## Querying WikiPathways_2016... Done.
## Querying KEGG_2016... Done.
## Querying Panther_2016... Done.
## Querying Reactome_2016... Done.
## Parsing results... Done.
datatable(enriched$WikiPathways_2016)
datatable(enriched$KEGG_2016)
datatable(enriched$Panther_2016)
datatable(enriched$Reactome_2016)
enriched <- enrichr(limma250, dbs)
## Uploading data to Enrichr... Done.
## Querying WikiPathways_2016... Done.
## Querying KEGG_2016... Done.
## Querying Panther_2016... Done.
## Querying Reactome_2016... Done.
## Parsing results... Done.
datatable(enriched$WikiPathways_2016)
datatable(enriched$KEGG_2016)
datatable(enriched$Panther_2016)
datatable(enriched$Reactome_2016)
# fit PCA model to log cpm data for healthy control group
# note: transpose data to make it tidy
set.seed(1)
pca_out <- prcomp(t(assay(se_cvid_ctrl,"log_counts_cpm")))
# define dataframe with PCA components and immune deficiency disease (IDC)
pca_plot <- as.data.frame(pca_out$x) # X, Y, ...
pca_plot$disease <- as.factor(se_cvid_ctrl$disease) # color/group
# plot PC1 and PC2 by IDC
g <- pca_plot %>% ggplot(aes(x=PC1, y=PC2, color=disease)) +
geom_point(size=1.5) + xlab("PC 1") + ylab("PC 2") +
geom_text(label=colnames(se_cvid_ctrl), nudge_y = 5,) +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("PCA Plot - Control group")
plot(g)
# fit UMAP model to log cpm data
# note: transpose data to make it tidy
set.seed(1)
umap.defaults$n_neighbors=5
umap_out <- umap(t(assay(se_cvid_ctrl,"log_counts_cpm")), config=umap.defaults)
# define dataframe with UMAP dimensions and IDC
umap_plot <- as.data.frame(umap_out$layout) # X, Y, ...
umap_plot$disease <- as.factor(se_cvid_ctrl$disease) # color / group
# plot UMAP1 and UMAP2 by IDC
g <- umap_plot %>% ggplot(aes(x=V1, y=V2, color=disease)) +
geom_point(size=1.5) + xlab("UMAP1") + ylab("UMAP2") +
geom_text(label=colnames(se_cvid_ctrl), nudge_y = 0.1,) +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("UMAP Plot - Control group")
plot(g)
# format data into a DESeq2-friendly Data Structure
dds <- DESeqDataSetFromMatrix(countData=counts, colData=meta_data, design=~disease)
#colData is a data frame of demographic/phenotypic data
# filter for genes with significant expression
# expression should be > 0, but > 100 should be better
dds<-dds[rowSums(counts(dds))>1,] #Gene Filtering
# fit negative binomial regression
dds<-DESeq(dds) #Performs estimation of size factors, dispersion, and negative binomial GLM f#itting
# extracting results, ordering them by adjusted p-value
res <- results(dds)[order(results(dds)[,6]),]
#res[1:10,]
# display results for the top 1000 most significant genes
datatable(data.frame(res[1:1000,]))
# store the name of the top 250 most significant genes
deseq250 <- rownames(res)[1:250]
PBMC.deseq.res <- data.frame(res)
# Make a Heatmap of DEGs
# All together: extract the log cpm of the top 250 genes and store that in a matrix
# # get the name of the genes ordered by adjusted p-val
top_genes = rownames(results(dds)[order(results(dds)$padj),])[1:250]
# # convert log cpm to a matrix structure sorted by the top genes
mat = as.matrix(assay(se_cvid_ctrl,"log_counts_cpm"))[top_genes,]
# normalize data row-wise
mat = t(scale(t(mat)))
# store IDC into a single-column dataframe
df=data.frame(disease=colData(se_cvid_ctrl)$disease)
# create HeatMap with some annotations
ha_colors = list(disease=c("healthy"="Blue","Deficient"="Red"))
ha = HeatmapAnnotation(df = df, col = ha_colors)
Heatmap(mat,show_row_names=F,show_column_names = F, top_annotation=ha)
# collapse data into healthy or not
# Store data in Limma-friendly Data Structure
dge <- DGEList(counts=counts, group=group) #From edgeR, Computes library size
# filter for genes with significant expression
counts<-counts[which(rowSums(cpm(counts))>1),] #Gene Filtering #(mask)
dge <- DGEList(counts=counts, group=group) #Re-compute library size #(filter)
# Trimmed-Mean Normalization of data
dge <- calcNormFactors(dge) #TMM normalization
# Create design matrix (intercept + indicator variable)
design<-model.matrix(~group)
# distribution normalizing transformation
v<-voom(dge, design) #voom transform to calculate weights to eliminate mean-variance #relationship
#use usual limma pipelines
# fit empirical bayes regression model
fit<-lmFit(v,design)
fit<-eBayes(fit)
# organize data by top ranked genes when predicting IDC (coef = col#2 = UCVID)
# topTable(fit, coef=2)
# display top 1000 most significant genes
datatable(topTable(fit, coef=2, number=1000))
# store the name of the top 250 most significant genes
limma250 <- rownames(topTable(fit, coef=2, number=250))
PBMC.limma.res <- data.frame(topTable(fit, coef=2))
# Make a Heatmap of DEGs
# extract the log cpm of the top 250 genes and store that in a matrix
# get the name of the top 250 most significant genes
top_genes = rownames(topTable(fit, coef=2, number=250))
# convert log cpm to a matrix structure
mat = as.matrix(assay(se_cvid_ctrl,"log_counts_cpm"))[top_genes,]
# normalize data row-wise
mat = t(scale(t(mat)))
# store IDC into a single-column dataframe
df=data.frame(disease=colData(se_cvid_ctrl)$disease)
# create HeatMap with some annotations
ha = HeatmapAnnotation(df = df, col=ha_colors)
Heatmap(mat,show_row_names=F,show_column_names=F, top_annotation=ha)
# based on the results of DEseq, cross-reference the gene with pathway databases to infer which pathways are the most significant
enriched <- enrichr(deseq250, dbs)
## Uploading data to Enrichr... Done.
## Querying WikiPathways_2016... Done.
## Querying KEGG_2016... Done.
## Querying Panther_2016... Done.
## Querying Reactome_2016... Done.
## Parsing results... Done.
datatable(enriched$WikiPathways_2016)
datatable(enriched$KEGG_2016)
datatable(enriched$Panther_2016)
datatable(enriched$Reactome_2016)
enriched <- enrichr(limma250, dbs)
## Uploading data to Enrichr... Done.
## Querying WikiPathways_2016... Done.
## Querying KEGG_2016... Done.
## Querying Panther_2016... Done.
## Querying Reactome_2016... Done.
## Parsing results... Done.
datatable(enriched$WikiPathways_2016)
datatable(enriched$KEGG_2016)
datatable(enriched$Panther_2016)
datatable(enriched$Reactome_2016)
#combine result lists into dataframe
top.250.results.df <- data.frame(ctrl_deseq=ctrl_deseq,
ctrl_limma=ctrl_limma,
LPS_deseq=LPS_deseq,
LPS_limma=LPS_limma)
# obtain project directory
project_root <- rprojroot::find_root(has_dir("Meta-omic-Analysis-Capstone"))
# obtain comparison analysis directory
relative_path <- file.path("Meta-omic-Analysis-Capstone/Deliverable 6")
# file output paths
output.path.1 <- fs::path(project_root,
relative_path, "data",
"PBMC_CVID_top250_results.csv")
output.path.2 <- fs::path(project_root,
relative_path, "data",
"PBMC_CVID_all_results.csv")
# write .csv file into Deliverable 6 folder
write.csv(x=top.250.results.df, file=output.path.1, row.names = T)
write.csv(x=PBMC.deseq.res, file=output.path.2, row.names = T)